2D SPLINE MEGVALÓSÍTÁSA MGCV KÖNYVTÁR SEGÍTSÉGÉVEL
compute_expected_2d_mgcv <- function(
counts,
exclude = NULL,
include.trend = TRUE,
k_time = 20,
k_age = 10,
k_season = 10,
frequency = NULL, # most már csak meta infó
weekday.effect = FALSE,
age_var = "age_mid",
verbose = TRUE
) {
# --- 1. Ellenőrzések -------------------------------------------------------
required_cols <- c("date", "outcome", age_var)
if (!all(required_cols %in% names(counts))) {
stop("Hiányzó oszlop(ok): ", paste(required_cols, collapse = ", "))
}
if (!lubridate::is.Date(counts$date))
stop("A date oszlopnak Date osztályúnak kell lennie.")
# population oszlop: ha nincs, legyen 1 (offset nélkül kb. ugyanaz)
if (!("population" %in% names(counts))) {
if (verbose) message("Nincs 'population' oszlop, beállítom 1-re.")
counts$population <- 1
}
# --- 2. Frequency becslés (nem kritikus, de elmentjük meta infónak) -------
unique_dates <- sort(unique(counts$date))
if (is.null(frequency)) {
total_days <- as.numeric(max(unique_dates) - min(unique_dates))
n_time <- length(unique_dates)
frequency <- round(365 / (total_days / n_time))
if (verbose)
message("Detectált frequency: ", frequency)
}
# --- 3. Segédfüggvény: szökőévek kezelése (Acosta–Irizarry logika) -------
noleap_yday <- function(date){
yd <- lubridate::yday(date)
yd[lubridate::leap_year(date) & yd > 59] <- yd[lubridate::leap_year(date) & yd > 59] - 1
yd
}
# --- 4. Extra kovariáták ---------------------------------------------------
min_date <- min(counts$date)
counts <- counts %>%
mutate(
time_scaled = as.numeric(date - min_date) / 365.25, # idő években, 0-tól indul
age_s = .data[[age_var]],
doy = noleap_yday(date),
wday = lubridate::wday(date),
doy_scaled = 2 * pi * doy / 365 # ciklikus inputnak kényelmes
)
# --- 5. Train–test bontás ---------------------------------------------------
index <- !(counts$date %in% exclude)
counts_fit <- counts[index, ]
counts_pred <- counts
if (!any(index)) stop("Nincs tanító adat (minden dátum kizárva).")
# --- 6. GAM formula ---------------------------------------------------------
# - te(time_scaled, age_s): hosszú távú trend + age-hatás
# - s(doy_scaled, bs="cc"): ciklikus szezonális komponens
if (weekday.effect) {
gam_formula <- outcome ~
te(time_scaled, age_s, bs = c("tp","tp"),
k = c(k_time, k_age)) +
s(doy_scaled, bs = "cc", k = k_season) +
factor(wday)
} else {
gam_formula <- outcome ~
te(time_scaled, age_s, bs = c("tp","tp"),
k = c(k_time, k_age)) +
s(doy_scaled, bs = "cc", k = k_season)
}
if (verbose) {
message("GAM formula:\n", deparse(gam_formula))
}
# --- 7. Modell illesztése ---------------------------------------------------
fit <- mgcv::gam(
formula = gam_formula,
family = quasipoisson(link = "log"),
offset = log(population),
data = counts_fit,
method = "REML"
)
# --- 8. Előrejelzés (train + test) -----------------------------------------
eta_all <- predict(fit, newdata = counts_pred, type = "link", se.fit = FALSE)
mu_all <- exp(eta_all)
# (ha kellene: se.fit = TRUE és Delta-módszerrel log_expected_se, de most 0-ra állítjuk)
log_expected_se <- rep(0, nrow(counts_pred))
# --- 9. Kimenet összeállítása ----------------------------------------------
out <- counts_pred %>%
mutate(
expected = mu_all,
log_expected_se = log_expected_se,
excluded = !index
)
attr(out, "model") <- fit
attr(out, "frequency") <- frequency
attr(out, "type") <- "mgcv_2d_spline_cc"
return(out)
}
Adatok importálása
data <- as.data.frame(read.csv2("df_results_23_years_version2_copied.csv",
header = TRUE,
check.names = FALSE,
sep = ","))
data$date <- as.Date(data$date)
# wide -> long
data_long <- data %>%
pivot_longer(
cols = -date,
names_to = "age_group",
values_to = "outcome"
) %>%
mutate(
age_mid = case_when(
age_group == "80+" ~ 82.5,
TRUE ~ (as.numeric(sub("-.*", "", age_group)) +
as.numeric(sub(".*-", "", age_group))) / 2
),
population = 1
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `age_mid = case_when(...)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
Adatelemzés
20 év tanítás + 1 év előrejelzés
all_dates <- sort(unique(data_long$date))
start_date <- min(all_dates)
train_end_date <- start_date + lubridate::years(20)
train_dates <- all_dates[all_dates < train_end_date]
test_start_date <- train_end_date
test_end_date <- train_end_date + lubridate::years(1)
test_dates <- all_dates[all_dates >= test_start_date &
all_dates < test_end_date]
expected_all <- compute_expected_2d_mgcv(
counts = data_long,
exclude = test_dates,
age_var = "age_mid",
k_time = 20,
k_age = 10,
k_season = 10,
weekday.effect = FALSE
)
## Detectált frequency: 52
## GAM formula:
## outcome ~ te(time_scaled, age_s, bs = c("tp", "tp"), k = c(k_time, k_age)) + s(doy_scaled, bs = "cc", k = k_season)
TRAIN - TEST összevető táblázat
# TRAIN - TEST szétválasztás
results <- expected_all
# train adatok (tanulási 20 év)
train_df <- results %>%
filter(!excluded) %>%
select(date, age_mid, observed = outcome, expected)
# test adatok (1 év előrejelzés)
test_df <- results %>%
filter(excluded) %>%
select(date, age_mid, observed = outcome, expected)
Hibamértékek (MSE, RMSE, MAE)
# Error metrikák kiszámítása teszt időszakra
error_metrics <- test_df %>%
mutate(error = observed - expected) %>%
summarise(
MSE = mean(error^2),
RMSE = sqrt(mean(error^2)),
MAE = mean(abs(error))
)
error_metrics
## # A tibble: 1 × 3
## MSE RMSE MAE
## <dbl> <dbl> <dbl>
## 1 142611. 378. 99.9
error_by_age <- test_df %>%
mutate(error = observed - expected) %>%
group_by(age_mid) %>%
summarise(
MSE = mean(error^2),
RMSE = sqrt(mean(error^2)),
MAE = mean(abs(error))
)
error_by_age
## # A tibble: 17 × 4
## age_mid MSE RMSE MAE
## <dbl> <dbl> <dbl> <dbl>
## 1 2 34.2 5.85 3.90
## 2 7 3.64 1.91 1.82
## 3 12 0.669 0.818 0.617
## 4 17 2.12 1.46 1.03
## 5 22 3.56 1.89 1.32
## 6 27 7.30 2.70 2.26
## 7 32 14.4 3.79 2.80
## 8 37 38.0 6.17 4.11
## 9 42 59.3 7.70 5.63
## 10 47 207. 14.4 10.7
## 11 52 662. 25.7 17.9
## 12 57 1820. 42.7 27.7
## 13 62 4433. 66.6 44.9
## 14 67 14370. 120. 87.9
## 15 72 90267. 300. 191.
## 16 77 310930. 558. 376.
## 17 82.5 2001535. 1415. 918.
Ábrázolás
# fitted GAM modell
fit <- attr(expected_all, "model")
# rács létrehozása idő × kor dimenzióban
time_seq <- seq(
min(data_long$date),
max(data_long$date),
length.out = 80
)
age_seq <- seq(
min(data_long$age_mid),
max(data_long$age_mid),
length.out = 40
)
grid <- expand.grid(
date = time_seq,
age_mid = age_seq
)
# a compute_expected_2d_mgcv-ben használt transzformációkat újra kell alkalmazni:
min_date <- min(data_long$date)
grid <- grid %>%
mutate(
time_scaled = as.numeric(date - min_date) / 365.25,
age_s = age_mid,
doy = {
yd <- lubridate::yday(date)
yd[lubridate::leap_year(date) & yd > 59] <- yd[lubridate::leap_year(date) & yd > 59] - 1
yd
},
doy_scaled = 2 * pi * doy / 365,
population = 1 # vagy ha van valódi populáció, azt betenni
)
Z_vec <- predict(fit, newdata = grid, type = "response")
Z <- matrix(Z_vec,
nrow = length(time_seq),
ncol = length(age_seq),
byrow = FALSE)
plot_ly(
x = time_seq,
y = age_seq,
z = ~Z,
type = "surface"
) %>%
layout(
scene = list(
xaxis = list(title = "Dátum"),
yaxis = list(title = "Korcsoport (age_mid)"),
zaxis = list(title = "Várt halálozás")
)
)
Ábra korcsoportonként (face-grid)
ggplot(results, aes(x = date)) +
geom_line(aes(y = outcome, color = "Observed"), size = 0.6) +
geom_line(aes(y = expected, color = "Expected"), size = 0.6) +
facet_wrap(~ age_mid, scales = "free_y") +
scale_color_manual(values = c("Observed" = "black", "Expected" = "blue")) +
labs(title = "Observed vs Expected by Age Group",
color = "") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Az előrejelzett év ábrázolása
test_total <- test_df %>%
group_by(date) %>%
summarise(
observed = sum(observed),
expected = sum(expected),
.groups = "drop"
)
ggplot(test_total, aes(x = date)) +
geom_line(aes(y = observed, color = "Observed"), linewidth = 0.6) +
geom_line(aes(y = expected, color = "Expected"), linewidth = 1.1) +
scale_color_manual(
values = c(
"Observed" = "#0f77b4",
"Expected" = "#ff7f0e"
),
labels = c(
"Observed" = "Megfigyelt halálozás",
"Expected" = "Prediktált halálozás"
),
name = NULL
) +
labs(
x = "Dátum",
y = "Halálozási esetszám"
) +
theme_minimal() +
theme(
legend.position = "bottom"
)

Az előrejelzett év ábrázolása korcsoportonként
test_only <- test_df
ggplot(test_only, aes(x = date)) +
geom_line(aes(y = observed, color = "Observed"), linewidth = 0.7) +
geom_line(aes(y = expected, color = "Expected"), linewidth = 0.7) +
facet_wrap(~ age_mid, scales = "free_y") +
scale_color_manual(values = c("Observed" = "black", "Expected" = "blue")) +
labs(
title = "Observed vs Expected – 1-Year Forecast by Age Group",
x = "Date",
y = "Mortality",
color = ""
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 10)
)

20 év tanítás + 2 év előrejelzés
test_end_date_2 <- train_end_date + lubridate::years(2)
test_dates_2 <- all_dates[all_dates >= test_start_date &
all_dates < test_end_date_2]
expected_all_2 <- compute_expected_2d_mgcv(
counts = data_long,
exclude = test_dates_2,
age_var = "age_mid",
k_time = 20,
k_age = 10,
k_season = 10,
weekday.effect = FALSE
)
## Detectált frequency: 52
## GAM formula:
## outcome ~ te(time_scaled, age_s, bs = c("tp", "tp"), k = c(k_time, k_age)) + s(doy_scaled, bs = "cc", k = k_season)
TRAIN- TEST összevető táblázat
results_2 <- expected_all_2
# train adatok
train_df_2 <- results_2 %>%
filter(!excluded) %>%
select(date, age_mid, observed = outcome, expected)
# test adatok
test_df_2 <- results_2 %>%
filter(excluded) %>%
select(date, age_mid, observed = outcome, expected)
Hibamérték (MSE, RMSE, MAE)
# Error metrikák kiszámítása teszt időszakra
error_metrics_2 <-test_df_2 %>%
mutate(error = observed - expected) %>%
summarise(
MSE = mean(error^2),
RMSE = sqrt(mean(error^2)),
MAE = mean(abs(error))
)
error_metrics_2
## # A tibble: 1 × 3
## MSE RMSE MAE
## <dbl> <dbl> <dbl>
## 1 228546. 478. 108.
error_by_age_2 <- test_df_2 %>%
mutate(error = observed - expected) %>%
group_by(age_mid) %>%
summarise(
MSE = mean(error^2),
RMSE = sqrt(mean(error^2)),
MAE = mean(abs(error))
)
error_by_age_2
## # A tibble: 17 × 4
## age_mid MSE RMSE MAE
## <dbl> <dbl> <dbl> <dbl>
## 1 2 47.5 6.89 3.91
## 2 7 3.83 1.96 1.88
## 3 12 0.741 0.861 0.569
## 4 17 2.29 1.51 0.940
## 5 22 4.31 2.08 1.25
## 6 27 8.97 2.99 2.26
## 7 32 19.6 4.42 2.77
## 8 37 50.9 7.13 4.01
## 9 42 84.1 9.17 5.73
## 10 47 297. 17.2 11.0
## 11 52 968. 31.1 18.4
## 12 57 2647. 51.4 28.5
## 13 62 6649. 81.5 46.8
## 14 67 21846. 148. 93.7
## 15 72 137655. 371. 200.
## 16 77 493652. 703. 408.
## 17 82.5 3221339. 1795. 1002.
20 év tanítás + 3 év előrejelzés
test_end_date_3 <- train_end_date + lubridate::years(3)
test_dates_3 <- all_dates[all_dates >= test_start_date &
all_dates < test_end_date_3]
expected_all_3 <- compute_expected_2d_mgcv(
counts = data_long,
exclude = test_dates_3,
age_var = "age_mid",
k_time = 20,
k_age = 10,
k_season = 10,
weekday.effect = FALSE
)
## Detectált frequency: 52
## GAM formula:
## outcome ~ te(time_scaled, age_s, bs = c("tp", "tp"), k = c(k_time, k_age)) + s(doy_scaled, bs = "cc", k = k_season)
TRAIN- TEST összevető táblázat
results_3 <- expected_all_3
# train adatok
train_df_3 <- results_3 %>%
filter(!excluded) %>%
select(date, age_mid, observed = outcome, expected)
# test adatok
test_df_3 <- results_3 %>%
filter(excluded) %>%
select(date, age_mid, observed = outcome, expected)
Hibamérték (MSE, RMSE, MAE)
# Error metrikák kiszámítása teszt időszakra
error_metrics_3 <-test_df_3 %>%
mutate(error = observed - expected) %>%
summarise(
MSE = mean(error^2),
RMSE = sqrt(mean(error^2)),
MAE = mean(abs(error))
)
error_metrics_3
## # A tibble: 1 × 3
## MSE RMSE MAE
## <dbl> <dbl> <dbl>
## 1 200733. 448. 109.
error_by_age_3 <- test_df_3 %>%
mutate(error = observed - expected) %>%
group_by(age_mid) %>%
summarise(
MSE = mean(error^2),
RMSE = sqrt(mean(error^2)),
MAE = mean(abs(error))
)
error_by_age_3
## # A tibble: 17 × 4
## age_mid MSE RMSE MAE
## <dbl> <dbl> <dbl> <dbl>
## 1 2 42.9 6.55 4.00
## 2 7 3.79 1.95 1.86
## 3 12 0.620 0.788 0.533
## 4 17 2.12 1.46 0.966
## 5 22 3.77 1.94 1.24
## 6 27 7.73 2.78 2.15
## 7 32 17.3 4.16 2.71
## 8 37 47.0 6.85 4.21
## 9 42 73.4 8.57 5.65
## 10 47 258. 16.1 10.8
## 11 52 850. 29.2 18.4
## 12 57 2374. 48.7 29.3
## 13 62 5836. 76.4 46.9
## 14 67 18638. 137. 90.6
## 15 72 123889. 352. 209.
## 16 77 428825. 655. 404.
## 17 82.5 2831590. 1683. 1014.